# R Options
options(stringsAsFactors=FALSE)

# Required libraries
library(Seurat) # main
library(ggplot2) # plots
library(patchwork) # combination of plots
library(magrittr) # %>% operator
library(reticulate) # required for 'leiden' clustering
library(enrichR) # functional enrichment

# Other libraries we use
# Knit: knitr
# Data handling: dplyr, tidyr, purrr
# Tables: kableExtra
# Plots: ggsci, ggpubr
# IO: openxlsx
# Annotation: biomaRt
# DEG: mast
# Functional enrichment: enrichR

# Source plotting functions
source("R/functions_io.R")
source("R/functions_plotting.R")
source("R/functions_analysis.R")
source("R/functions_util.R")

# Knitr default options
knitr::opts_chunk$set(echo=TRUE, cache=FALSE, message=FALSE, warning=FALSE, fig.width=10)

# Potentially needed for clustering, umap, other python packages
use_python('/usr/bin/python')

Dataset description

Project-specific parameters

Open this code chunk to read all parameters that are set specifically for your project.

param = list()

# Project ID
param$project = "pbmc"

# Input data path in case Cell Ranger was run 
param$path_data = "test_datasets/10x_pbmc_1k_healthyDonor_v3Chemistry/counts/"
param$file_mapping_stats = "test_datasets/10x_pbmc_1k_healthyDonor_v3Chemistry/pbmc_1k_v3_metrics_summary.csv"
param$file_annot = paste0(param$path_data, "/hsapiens_gene_ensembl.annot.csv")

# Project-specific paths
param$path_out = "test_datasets/10x_pbmc_1k_healthyDonor_v3Chemistry/results"
if (!file.exists(param$path_out)) dir.create(param$path_out, recursive=TRUE)

# Prefix of mitochondrial genes 
param$mt = "^MT-"

# Biomart dataset to use for gene name translations
param$mart_dataset = "hsapiens_gene_ensembl"

# Biomart attributes for the annotation if no csv is given
param$mart_attributes = c("ensembl_gene_id_version", "hgnc_symbol", "chromosome_name", "start_position", "end_position", "transcript_length", "percentage_gene_gc_content", "gene_biotype", "strand", "description")

# The number of PCs to use; adjust this parameter based on JackStraw and Elbowplot 
param$pc_n = 7

# Resolution of clusters; low values will lead to fewer clusters of cells 
param$cluster_resolution=0.5

# Thresholds to define differentially expressed genes 
param$padj = 0.05
param$log2fc = log2(1.5)

# Marker genes based on literature 
# https://icb-scanpy-tutorials.readthedocs-hosted.com/en/latest/visualizing-marker-genes.html
param$known_markers = list()
param$known_markers[["bcell"]] = c("CD79A", "MS4A1")
param$known_markers[["tcell"]] = "CD3D"
param$known_markers[["tcell.cd8+"]] = c("CD8A", "CD8B")
param$known_markers[["nk"]] = c("GNLY", "NKG7")
param$known_markers[["myeloid"]] = c("CST3", "LYZ")
param$known_markers[["monocytes"]] = "FCGR3A"
param$known_markers[["dendritic"]] = "FCER1A"

# Enrichr databases of interest
param$enrichr_dbs = c("GO_Molecular_Function_2018", "GO_Biological_Process_2018", "GO_Cellular_Component_2018")

# Main color to use for plots
param$col = "palevioletred"

Read data

Read and print mapping statistics

We begin by printing mapping statistics that have been produced prior to this workflow.

Mapping statistics
Value
Estimated Number of Cells 1,222
Mean Reads per Cell 54,502
Median Genes per Cell 1,919
Number of Reads 66,601,887
Valid Barcodes 97.4%
Sequencing Saturation 70.8%
Q30 Bases in Barcode 94.1%
Q30 Bases in RNA Read 90.2%
Q30 Bases in Sample Index 91.1%
Q30 Bases in UMI 92.7%
Reads Mapped to Genome 95.4%
Reads Mapped Confidently to Genome 92.4%
Reads Mapped Confidently to Intergenic Regions 4.8%
Reads Mapped Confidently to Intronic Regions 31.1%
Reads Mapped Confidently to Exonic Regions 56.5%
Reads Mapped Confidently to Transcriptome 53.7%
Reads Mapped Antisense to Gene 1.0%
Fraction Reads in Cells 94.9%
Total Genes Detected 18,391
Median UMI Counts per Cell 6,628

Setup the Seurat object

We next read the scRNA-seq counts table to initialise a Seurat object.

## An object of class Seurat 
## 15247 features across 1222 samples within 1 assay 
## Active assay: RNA (15247 features)

Read gene annotation

We finally read additional gene annotation from Ensembl, and translate Ensembl IDs to Entrez gene symbols. The resulting table is written to file.

# Read feature IDs: Ensembl ID, 10X gene symbol 
gene.ids = read.delim(paste0(param$path_data, "features.tsv.gz"), header=FALSE)
gene.ids = gene.ids[gene.ids[,3]=="Gene Expression", 1:2]
colnames(gene.ids) = c("Ensembl", "GeneSymbol")

# Seurat does this as well, so we need to replicate it
gene.ids[,"GeneSymbol"] = make.unique(gene.ids[,"GeneSymbol"]) 

# When you call "CreateSeuratObject", underscores "_" in gene names are replaced with minus "-"
gene.ids = cbind(gene.ids, GeneSymbolEdited=gsub(gene.ids[,"GeneSymbol"], pattern="_", replacement="-", fixed=TRUE))

# Retrieve Entrez Gene Symbols; this is needed for EnrichR functional enrichment analysis
# mart = useMart("ensembl", dataset=param$mart_dataset) # At point of merge, this did not work, biomart was unresponsive
#mart = useEnsembl("ensembl", dataset=param$mart_dataset, mirror="www") 
#mapping = getBM(
#  filters="ensembl_gene_id",
#  attributes=c("ensembl_gene_id", "entrezgene_accession"),
#  values=gene.ids[,"Ensembl"],
#  mart=mart)

# Note that we get the first Entrez Symbol that matches the Ensembl ID
#mapping.match = match(gene.ids[,"Ensembl"], mapping[,"ensembl_gene_id"])
#gene.ids = cbind(gene.ids, EntrezSymbol=mapping[mapping.match,"entrezgene_accession"])

# If useMart hangs again, we can outcomment the upper part, and use this hack instead
# This works for human
gene.ids = cbind(gene.ids, EntrezSymbol=gene.ids[,"GeneSymbol"])

# Write table
write.table(gene.ids, file=paste0(param$path_out, "/GeneIds.txt"), sep="\t", row.names=TRUE, col.names=TRUE, quote=FALSE)

# Create translation table
symbol_to_ensembl = setNames(gene.ids[,"Ensembl"], gene.ids[,"GeneSymbolEdited"])
symbol_to_entrez = setNames(gene.ids[,"EntrezSymbol"], gene.ids[,"GeneSymbolEdited"])

# Read Ensembl annotation from csv or from ensembl and a tab seperated csv will be created
if (!is.null(param$file_annot) && file.exists(param$file_annot)){
annot_ensembl = read.delim(param$file_annot)
} else{
  if (is.null(param$file_annot)){
    param$file_annot = paste0(param$path_out, param$mart_dataset, '.annot.csv')
  }
  annot.mart = useEnsembl("ensembl", dataset = param$mart_dataset, mirror="www")
  annot_ensembl = getBM(mart = annot.mart, attributes = param$mart_attributes)
  write.table(annot_ensembl, file = param$file_annot, sep = '\t', col.names = TRUE, row.names = FALSE, append = FALSE)
  print(paste0("Gene annotation file was created at: ", param$file_annot))
}

Pre-processing

Quality control

We start the analysis by removing unwanted cells from the dataset. Three commonly used QC metrics include the number of unique genes detected in each cell (“nFeature”), the total number of molecules detected in each cell (“nCount”), and the percentage of reads that map to the mitochrondrial genome (“percent.mt”).

Meta-data, top 5 rows
orig.ident nCount_RNA nFeature_RNA percent.mt
AAACCCAAGGAGAGTA pbmc 8286 2618 10.777215
AAACGCTTCAGCCCAG pbmc 5509 1805 7.968778
AAAGAACAGACGACTG pbmc 4280 1559 6.191589
AAAGAACCAATGGCAG pbmc 2754 1225 5.991285
AAAGAACGTCTGCAAT pbmc 6589 1828 6.617089

## An object of class Seurat 
## 15247 features across 1113 samples within 1 assay 
## Active assay: RNA (15247 features)

Normalisation, feature selection and scaling

Feature selection: For downstream analysis it is beneficial to focus on genes that exhibit high cell-to-cell variation, that is they are highly expressed in some cells and lowly expressed in others.

Scaling: To be able to compare normalised gene counts between genes, gene counts are further scaled to have zero mean and unit variance (z-score). This way, genes are equally weighted for downstream analysis.

Dimensionality reduction

A single-cell dataset of 20,000 genes and 5,000 cells has 20,000 dimensions. The biological manifold however can be described by far fewer dimensions than the number of genes. Dimension reduction methods aim to find these dimensions. There are two general purposes for dimension reduction methods: to summarise a dataset, and to visualise a dataset.

We use Principal Component Analysis (PCA) to summarise a dataset, overcoming noise and reducing the data to its essential components. Each principal component (PC) represents a “metafeature” that combines information across a correlated gene set. Later, we use Uniform Manifold Approximation and Projection (UMAP) to visualise the dataset, placing similar cells together in 2D space, see below.

To decide how many PCs to include in downstream analyses, we visualize cells and genes that define the PCA.

Dimensionality of the dataset

We next need to decide how many PCs we want to use for downstream analyses. The following two plots are designed to help us make an informed decision.

The first plot is based on the “JackStraw” procedure: parts of the data is repeatedly randomly permuted and PCA is rerun, generating a “null distribution” of feature scores. Significant PCs are those with a strong enrichment of low p-value features.

The second plot is an “Elbow plot”: PCs are ranked based on the percentage of variance they explain.

For your dataset, we decided to go for 7 PCs.

Downstream analysis

Clustering

Seurat’s clustering method first constructs a graph structure, where nodes are cells and edges are drawn between cells with similar gene expression patterns. Technically speaking, Seurat first constructs a K-nearest neighbor (KNN) graph based on Euclidean distance in PCA space, and refines edge weights between cells based on the shared overlap in their local neighborhoods (Jaccard similarity). To partition the graph into highly interconnected parts, cells are iteratively grouped together using the Leiden algorithm.

Visualisation with UMAP

We use a UMAP to visualise and explore a dataset. The goal is to place similar cells together in 2D space, and learn about the biology underlying the data. Cells are color-coded according to the graph-based clustering, and clusters typcially co-localise on the UMAP.

Take care not to mis-read a UMAP:

  • Parameters influence the plot (we use defaults here)
  • Cluster sizes relative to each other mean nothing, since the method has a local notion of distance
  • Distances between clusters might not mean anything
  • You may need more than one plot

For a nice read to intuitively understand UMAP, see https://pair-code.github.io/understanding-umap/.

Differentially expressed genes

We next identify genes that are differentially expressed in one cluster compared to all other clusters. Additional gene annotation is added, and the resulting tables are written to file.

Top 2 DEGs per cell cluster
p_val avg_logFC pct.1 pct.2 p_val_adj cluster gene
0 3.8292753 1.000 0.156 0 1 S100A8
0 3.4714042 1.000 0.204 0 1 S100A9
0 1.3654774 0.938 0.255 0 2 IL7R
0 1.0531971 0.964 0.341 0 2 TRAC
0 3.6609018 0.871 0.025 0 3 IGHM
0 4.9139410 0.796 0.237 0 3 IGKC
0 0.9279829 0.759 0.154 0 4 CCR7
0 0.9427734 0.927 0.499 0 4 NOSIP
0 2.0182653 0.837 0.049 0 5 GZMK
0 1.8890308 0.748 0.140 0 5 KLRB1
0 1.7451108 1.000 0.293 0 6 CST3
0 1.6522173 1.000 0.514 0 6 HLA-DPA1
0 3.8593508 0.980 0.073 0 7 GNLY
0 2.9277180 1.000 0.187 0 7 NKG7
0 5.0496622 1.000 0.237 0 8 NRGN
0 5.9002407 1.000 0.042 0 8 PPBP

Visualisation of differentially expressed genes

The following plots are exemplary to how we can visualize differentially expressed genes using the “Seurat” R-package. The selected genes are the top differentially expressed genes for all clusters, respectively.

Functional enrichment analysis

To gain first insights into potential functions of cells in a cluster, we test for over-representation of functional terms amongst up- and down-regulated genes of each cluster. Over-represented terms are written to file.

We first translate gene symbols of up- and down-regulated genes per cluster into Entrez gene symbols, and then use the “enrichR” R-package to access the “Enrichr” website https://amp.pharm.mssm.edu/Enrichr/. You can choose to test functional enrichment from a wide range of databases:

Enrichr databases
geneCoverage genesPerTerm libraryName link numTerms
13362 275 Genome_Browser_PWMs http://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/ 615
27884 1284 TRANSFAC_and_JASPAR_PWMs http://jaspar.genereg.net/html/DOWNLOAD/ 326
6002 77 Transcription_Factor_PPIs 290
47172 1370 ChEA_2013 http://amp.pharm.mssm.edu/lib/cheadownload.jsp 353
47107 509 Drug_Perturbations_from_GEO_2014 http://www.ncbi.nlm.nih.gov/geo/ 701
21493 3713 ENCODE_TF_ChIP-seq_2014 http://genome.ucsc.edu/ENCODE/downloads.html 498
1295 18 BioCarta_2013 https://cgap.nci.nih.gov/Pathways/BioCarta_Pathways 249
3185 73 Reactome_2013 http://www.reactome.org/download/index.html 78
2854 34 WikiPathways_2013 http://www.wikipathways.org/index.php/Download_Pathways 199
15057 300 Disease_Signatures_from_GEO_up_2014 http://www.ncbi.nlm.nih.gov/geo/ 142
4128 48 KEGG_2013 http://www.kegg.jp/kegg/download/ 200
34061 641 TF-LOF_Expression_from_GEO http://www.ncbi.nlm.nih.gov/geo/ 269
7504 155 TargetScan_microRNA http://www.targetscan.org/cgi-bin/targetscan/data_download.cgi?db=vert_61 222
16399 247 PPI_Hub_Proteins http://amp.pharm.mssm.edu/X2K 385
12753 57 GO_Molecular_Function_2015 http://www.geneontology.org/GO.downloads.annotations.shtml 1136
23726 127 GeneSigDB http://genesigdb.org/genesigdb/downloadall.jsp 2139
32740 85 Chromosome_Location http://software.broadinstitute.org/gsea/msigdb/index.jsp 386
13373 258 Human_Gene_Atlas http://biogps.org/downloads/ 84
19270 388 Mouse_Gene_Atlas http://biogps.org/downloads/ 96
13236 82 GO_Cellular_Component_2015 http://www.geneontology.org/GO.downloads.annotations.shtml 641
14264 58 GO_Biological_Process_2015 http://www.geneontology.org/GO.downloads.annotations.shtml 5192
3096 31 Human_Phenotype_Ontology http://www.human-phenotype-ontology.org/ 1779
22288 4368 Epigenomics_Roadmap_HM_ChIP-seq http://www.roadmapepigenomics.org/ 383
4533 37 KEA_2013 http://amp.pharm.mssm.edu/lib/keacommandline.jsp 474
10231 158 NURSA_Human_Endogenous_Complexome https://www.nursa.org/nursa/index.jsf 1796
2741 5 CORUM http://mips.helmholtz-muenchen.de/genre/proj/corum/ 1658
5655 342 SILAC_Phosphoproteomics http://amp.pharm.mssm.edu/lib/keacommandline.jsp 84
10406 715 MGI_Mammalian_Phenotype_Level_3 http://www.informatics.jax.org/ 71
10493 200 MGI_Mammalian_Phenotype_Level_4 http://www.informatics.jax.org/ 476
11251 100 Old_CMAP_up http://www.broadinstitute.org/cmap/ 6100
8695 100 Old_CMAP_down http://www.broadinstitute.org/cmap/ 6100
1759 25 OMIM_Disease http://www.omim.org/downloads 90
2178 89 OMIM_Expanded http://www.omim.org/downloads 187
851 15 VirusMINT http://mint.bio.uniroma2.it/download.html 85
10061 106 MSigDB_Computational http://www.broadinstitute.org/gsea/msigdb/collections.jsp 858
11250 166 MSigDB_Oncogenic_Signatures http://www.broadinstitute.org/gsea/msigdb/collections.jsp 189
15406 300 Disease_Signatures_from_GEO_down_2014 http://www.ncbi.nlm.nih.gov/geo/ 142
17711 300 Virus_Perturbations_from_GEO_up http://www.ncbi.nlm.nih.gov/geo/ 323
17576 300 Virus_Perturbations_from_GEO_down http://www.ncbi.nlm.nih.gov/geo/ 323
15797 176 Cancer_Cell_Line_Encyclopedia https://portals.broadinstitute.org/ccle/home 967
12232 343 NCI-60_Cancer_Cell_Lines http://biogps.org/downloads/ 93
13572 301 Tissue_Protein_Expression_from_ProteomicsDB https://www.proteomicsdb.org/ 207
6454 301 Tissue_Protein_Expression_from_Human_Proteome_Map http://www.humanproteomemap.org/index.php 30
3723 47 HMDB_Metabolites http://www.hmdb.ca/downloads 3906
7588 35 Pfam_InterPro_Domains ftp://ftp.ebi.ac.uk/pub/databases/interpro/ 311
7682 78 GO_Biological_Process_2013 http://www.geneontology.org/GO.downloads.annotations.shtml 941
7324 172 GO_Cellular_Component_2013 http://www.geneontology.org/GO.downloads.annotations.shtml 205
8469 122 GO_Molecular_Function_2013 http://www.geneontology.org/GO.downloads.annotations.shtml 402
13121 305 Allen_Brain_Atlas_up http://www.brain-map.org/ 2192
26382 1811 ENCODE_TF_ChIP-seq_2015 http://genome.ucsc.edu/ENCODE/downloads.html 816
29065 2123 ENCODE_Histone_Modifications_2015 http://genome.ucsc.edu/ENCODE/downloads.html 412
280 9 Phosphatase_Substrates_from_DEPOD http://www.koehn.embl.de/depod/ 59
13877 304 Allen_Brain_Atlas_down http://www.brain-map.org/ 2192
15852 912 ENCODE_Histone_Modifications_2013 http://genome.ucsc.edu/ENCODE/downloads.html 109
4320 129 Achilles_fitness_increase http://www.broadinstitute.org/achilles 216
4271 128 Achilles_fitness_decrease http://www.broadinstitute.org/achilles 216
10496 201 MGI_Mammalian_Phenotype_2013 http://www.informatics.jax.org/ 476
1678 21 BioCarta_2015 https://cgap.nci.nih.gov/Pathways/BioCarta_Pathways 239
756 12 HumanCyc_2015 http://humancyc.org/ 125
3800 48 KEGG_2015 http://www.kegg.jp/kegg/download/ 179
2541 39 NCI-Nature_2015 http://pid.nci.nih.gov/ 209
1918 39 Panther_2015 http://www.pantherdb.org/ 104
5863 51 WikiPathways_2015 http://www.wikipathways.org/index.php/Download_Pathways 404
6768 47 Reactome_2015 http://www.reactome.org/download/index.html 1389
25651 807 ESCAPE http://www.maayanlab.net/ESCAPE/ 315
19129 1594 HomoloGene http://www.ncbi.nlm.nih.gov/homologene 12
23939 293 Disease_Perturbations_from_GEO_down http://www.ncbi.nlm.nih.gov/geo/ 839
23561 307 Disease_Perturbations_from_GEO_up http://www.ncbi.nlm.nih.gov/geo/ 839
23877 302 Drug_Perturbations_from_GEO_down http://www.ncbi.nlm.nih.gov/geo/ 906
15886 9 Genes_Associated_with_NIH_Grants https://grants.nih.gov/grants/oer.htm 32876
24350 299 Drug_Perturbations_from_GEO_up http://www.ncbi.nlm.nih.gov/geo/ 906
3102 25 KEA_2015 http://amp.pharm.mssm.edu/Enrichr 428
31132 298 Gene_Perturbations_from_GEO_up http://www.ncbi.nlm.nih.gov/geo/ 2460
30832 302 Gene_Perturbations_from_GEO_down http://www.ncbi.nlm.nih.gov/geo/ 2460
48230 1429 ChEA_2015 http://amp.pharm.mssm.edu/Enrichr 395
5613 36 dbGaP http://www.ncbi.nlm.nih.gov/gap 345
9559 73 LINCS_L1000_Chem_Pert_up https://clue.io/ 33132
9448 63 LINCS_L1000_Chem_Pert_down https://clue.io/ 33132
16725 1443 GTEx_Tissue_Sample_Gene_Expression_Profiles_down http://www.gtexportal.org/ 2918
19249 1443 GTEx_Tissue_Sample_Gene_Expression_Profiles_up http://www.gtexportal.org/ 2918
15090 282 Ligand_Perturbations_from_GEO_down http://www.ncbi.nlm.nih.gov/geo/ 261
16129 292 Aging_Perturbations_from_GEO_down http://www.ncbi.nlm.nih.gov/geo/ 286
15309 308 Aging_Perturbations_from_GEO_up http://www.ncbi.nlm.nih.gov/geo/ 286
15103 318 Ligand_Perturbations_from_GEO_up http://www.ncbi.nlm.nih.gov/geo/ 261
15022 290 MCF7_Perturbations_from_GEO_down http://www.ncbi.nlm.nih.gov/geo/ 401
15676 310 MCF7_Perturbations_from_GEO_up http://www.ncbi.nlm.nih.gov/geo/ 401
15854 279 Microbe_Perturbations_from_GEO_down http://www.ncbi.nlm.nih.gov/geo/ 312
15015 321 Microbe_Perturbations_from_GEO_up http://www.ncbi.nlm.nih.gov/geo/ 312
3788 159 LINCS_L1000_Ligand_Perturbations_down https://clue.io/ 96
3357 153 LINCS_L1000_Ligand_Perturbations_up https://clue.io/ 96
12668 300 L1000_Kinase_and_GPCR_Perturbations_down https://clue.io/ 3644
12638 300 L1000_Kinase_and_GPCR_Perturbations_up https://clue.io/ 3644
8973 64 Reactome_2016 http://www.reactome.org/download/index.html 1530
7010 87 KEGG_2016 http://www.kegg.jp/kegg/download/ 293
5966 51 WikiPathways_2016 http://www.wikipathways.org/index.php/Download_Pathways 437
15562 887 ENCODE_and_ChEA_Consensus_TFs_from_ChIP-X 104
17850 300 Kinase_Perturbations_from_GEO_down http://www.ncbi.nlm.nih.gov/geo/ 285
17660 300 Kinase_Perturbations_from_GEO_up http://www.ncbi.nlm.nih.gov/geo/ 285
1348 19 BioCarta_2016 http://cgap.nci.nih.gov/Pathways/BioCarta_Pathways 237
934 13 HumanCyc_2016 http://humancyc.org/ 152
2541 39 NCI-Nature_2016 http://pid.nci.nih.gov/ 209
2041 42 Panther_2016 http://www.pantherdb.org/pathway/ 112
5209 300 DrugMatrix https://ntp.niehs.nih.gov/drugmatrix/ 7876
49238 1550 ChEA_2016 http://amp.pharm.mssm.edu/Enrichr 645
2243 19 huMAP http://proteincomplexes.org/ 995
19586 545 Jensen_TISSUES http://tissues.jensenlab.org/ 1842
22440 505 RNA-Seq_Disease_Gene_and_Drug_Signatures_from_GEO http://www.ncbi.nlm.nih.gov/geo/ 1302
8184 24 MGI_Mammalian_Phenotype_2017 http://www.informatics.jax.org/ 5231
18329 161 Jensen_COMPARTMENTS http://compartments.jensenlab.org/ 2283
15755 28 Jensen_DISEASES http://diseases.jensenlab.org/ 1811
10271 22 BioPlex_2017 http://bioplex.hms.harvard.edu/ 3915
10427 38 GO_Cellular_Component_2017 http://www.geneontology.org/ 636
10601 25 GO_Molecular_Function_2017 http://www.geneontology.org/ 972
13822 21 GO_Biological_Process_2017 http://www.geneontology.org/ 3166
8002 143 GO_Cellular_Component_2017b http://www.geneontology.org/ 816
10089 45 GO_Molecular_Function_2017b http://www.geneontology.org/ 3271
13247 49 GO_Biological_Process_2017b http://www.geneontology.org/ 10125
21809 2316 ARCHS4_Tissues http://amp.pharm.mssm.edu/archs4 108
23601 2395 ARCHS4_Cell-lines http://amp.pharm.mssm.edu/archs4 125
20883 299 ARCHS4_IDG_Coexp http://amp.pharm.mssm.edu/archs4 352
19612 299 ARCHS4_Kinases_Coexp http://amp.pharm.mssm.edu/archs4 498
25983 299 ARCHS4_TFs_Coexp http://amp.pharm.mssm.edu/archs4 1724
19500 137 SysMyo_Muscle_Gene_Sets http://sys-myo.rhcloud.com/ 1135
14893 128 miRTarBase_2017 http://mirtarbase.mbc.nctu.edu.tw/ 3240
17598 1208 TargetScan_microRNA_2017 http://www.targetscan.org/ 683
5902 109 Enrichr_Libraries_Most_Popular_Genes http://amp.pharm.mssm.edu/Enrichr 121
12486 299 Enrichr_Submissions_TF-Gene_Coocurrence http://amp.pharm.mssm.edu/Enrichr 1722
1073 100 Data_Acquisition_Method_Most_Popular_Genes http://amp.pharm.mssm.edu/Enrichr 12
19513 117 DSigDB http://tanlab.ucdenver.edu/DSigDB/DSigDBv1.0/ 4026
14433 36 GO_Biological_Process_2018 http://www.geneontology.org/ 5103
8655 61 GO_Cellular_Component_2018 http://www.geneontology.org/ 446
11459 39 GO_Molecular_Function_2018 http://www.geneontology.org/ 1151
19741 270 TF_Perturbations_Followed_by_Expression http://www.ncbi.nlm.nih.gov/geo/ 1958
27360 802 Chromosome_Location_hg19 http://hgdownload.cse.ucsc.edu/downloads.html 36
13072 26 NIH_Funded_PIs_2017_Human_GeneRIF https://www.ncbi.nlm.nih.gov/pubmed/ 5687
13464 45 NIH_Funded_PIs_2017_Human_AutoRIF https://www.ncbi.nlm.nih.gov/pubmed/ 12558
13787 200 Rare_Diseases_AutoRIF_ARCHS4_Predictions https://amp.pharm.mssm.edu/geneshot/ 3725
13929 200 Rare_Diseases_GeneRIF_ARCHS4_Predictions https://www.ncbi.nlm.nih.gov/gene/about-generif 2244
16964 200 NIH_Funded_PIs_2017_AutoRIF_ARCHS4_Predictions https://www.ncbi.nlm.nih.gov/pubmed/ 12558
17258 200 NIH_Funded_PIs_2017_GeneRIF_ARCHS4_Predictions https://www.ncbi.nlm.nih.gov/pubmed/ 5684
10352 58 Rare_Diseases_GeneRIF_Gene_Lists https://www.ncbi.nlm.nih.gov/gene/about-generif 2244
10471 76 Rare_Diseases_AutoRIF_Gene_Lists https://amp.pharm.mssm.edu/geneshot/ 3725
12419 491 SubCell_BarCode http://www.subcellbarcode.org/ 104
19378 37 GWAS_Catalog_2019 https://www.ebi.ac.uk/gwas 1737
6201 45 WikiPathways_2019_Human https://www.wikipathways.org/ 472
4558 54 WikiPathways_2019_Mouse https://www.wikipathways.org/ 176
3264 22 TRRUST_Transcription_Factors_2019 https://www.grnpedia.org/trrust/ 571
7802 92 KEGG_2019_Human https://www.kegg.jp/ 308
8551 98 KEGG_2019_Mouse https://www.kegg.jp/ 303
12444 23 InterPro_Domains_2019 https://www.ebi.ac.uk/interpro/ 1071
9000 20 Pfam_Domains_2019 https://pfam.xfam.org/ 608
7744 363 DepMap_WG_CRISPR_Screens_Broad_CellLines_2019 https://depmap.org/ 558
6204 387 DepMap_WG_CRISPR_Screens_Sanger_CellLines_2019 https://depmap.org/ 325
13420 32 MGI_Mammalian_Phenotype_Level_4_2019 http://www.informatics.jax.org/ 5261
14148 122 UK_Biobank_GWAS_v1 https://www.ukbiobank.ac.uk/tag/gwas/ 857
9813 49 BioPlanet_2019 https://tripod.nih.gov/bioplanet/ 1510
1397 13 ClinVar_2019 https://www.ncbi.nlm.nih.gov/clinvar/ 182
9116 22 PheWeb_2019 http://pheweb.sph.umich.edu/ 1161
17464 63 DisGeNET https://www.disgenet.org 9828
394 73 HMS_LINCS_KinomeScan http://lincs.hms.harvard.edu/kinomescan/ 148
11851 586 CCLE_Proteomics_2020 https://portals.broadinstitute.org/ccle 378
8189 421 ProteomicsDB_2020 https://www.proteomicsdb.org/ 913
18704 100 lncHUB_lncRNA_Co-Expression https://amp.pharm.mssm.edu/lnchub/ 3729
5605 39 Virus-Host_PPI_P-HIPSTer_2020 http://phipster.org/ 6715
5718 31 Elsevier_Pathway_Collection http://www.transgene.ru/disease-pathways/ 1721
14156 40 Table_Mining_of_CRISPR_Studies 802

Cell Cycle Effect

How much do gene expression profiles in your dataset reflect the cell cycle phases the single cells were in? We determine the effects of cell cycle heterogeneity by calculating a score for each cell based on its expression of G2M and S phase markers_ Scoring is based on the strategy described in Tirosh et al. 2016, and human gene names are translated using biomaRt.

Loupe Cell Browser integration

We export the UMAP 2D visualisation, metadata such as the cell clusters, and lists of differentially expressed genes, so you can open and work with these in the Loupe Cell Browser.

# Export UMAP coordinates
loupe_umap = as.data.frame(sc@reductions$umap@cell.embeddings)
loupe_umap$Barcode = gsub(pattern="_", replacement="-", x=rownames(loupe_umap), fixed=TRUE)
if (length(grep(pattern="-$", x=loupe_umap$Barcode, perl=TRUE))==0) loupe_umap$Barcode=paste0(loupe_umap$Barcode, "-1")
loupe_umap = loupe_umap[, c("Barcode", "UMAP_1", "UMAP_2")]
colnames(loupe_umap) = c("Barcode", "UMAP-1", "UMAP-2")
write.table(loupe_umap, file=paste0(param$path_out, "/Seurat2Loupe_Umap.csv"), col.names=TRUE, row.names=FALSE, quote=FALSE, sep=",")

# Export categorical metadata
meta_to_export = c("orig.ident", "seurat_clusters", "Phase")
# if (run.hto) meta.to.export = c(meta.to.export, "HTO_maxID", "HTO_classification", "HTO_classification.global", "hash.ID")
loupe_meta = as.data.frame(sc@meta.data[,meta_to_export])
loupe_meta = cbind(Barcode=gsub(pattern="_", replacement="-", rownames(loupe_meta), fixed=TRUE), loupe_meta)
if (length(grep(pattern="-$", x=loupe_meta$Barcode, perl=TRUE))==0) loupe_meta$Barcode=paste0(loupe_meta$Barcode, "-1")
write.table(x=loupe_meta, file=paste0(param$path_out, "/Seurat2Loupe_metadata.csv"), col.names=TRUE, row.names=FALSE, quote=FALSE, sep=",")

# Export gene sets
loupe_genesets = data.frame(List=paste0("DEG_up_cluster_", sc_markers_filt_up[,"cluster"]), 
                            Name=sc_markers_filt_up[,"gene"], 
                            Ensembl=symbol_to_ensembl[sc_markers_filt_up[,"gene"]])
loupe_genesets = rbind(loupe_genesets, 
                       data.frame(List=paste0("DEG_down_cluster_", sc_markers_filt_down[,"cluster"]), 
                            Name=sc_markers_filt_down[,"gene"], 
                            Ensembl=symbol_to_ensembl[sc_markers_filt_down[,"gene"]]))

genesets_to_export = list(genes_cc_s_phase=genes_s[,2], genes_cc_g2m_phase=genes_g2m[,2])
for (i in names(genesets_to_export)){
  tmp_genes = genesets_to_export[[i]]
  tmp_genes = tmp_genes[tmp_genes %in% names(symbol_to_ensembl)]
  loupe_genesets = rbind(loupe_genesets,
                         data.frame(List=i,
                                    Name=tmp_genes,
                                    Ensembl=symbol_to_ensembl[tmp_genes]))
}

write.table(loupe_genesets, file=paste0(param$path_out, "/Seurat2Loupe_genesets.csv"), col.names=TRUE, row.names=FALSE, quote=FALSE, sep=",")

Output files

All files generated with this report are written into the Seurat output folder: